{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.optimize import minimize\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义整个推荐系统\n",
    "### 1. roll(Theta, X)\n",
    "由于Theta与X都需要进行学习，使用此函数将Theta和X合并到一个矩阵中\n",
    "### 2.unroll(vector)\n",
    "在进行优化时，由于Theta和X的梯度计算公式不同，需对其单独进行计算与更新，使用该函数进行两个矩阵的分离\n",
    "### 3.initParams()\n",
    "对参数Theta与X进行随机初始化\n",
    "### 4.regularize(param)\n",
    "对参数进行正则化\n",
    "$$ L_2 = \\frac{\\lambda}{2} \\sum_{j=1}^{n_m}(\\theta_k^{(j)})^2 + \\frac{\\lambda}{2} \\sum_{i=1}^{n_u}(x_k^{(i)})^2 $$\n",
    "其中 $ n_m $ 表示电影总数， $ n_u $ 表示人数\n",
    "### 5.J(params)\n",
    "计算代价函数\n",
    "$$ J(x^{(1)},...,x^{(n_m)},\\theta^{(1)},...,\\theta^{(n_u)}) = \\frac{1}{2}\\sum_{(i,j):r(i,j)=1} ((\\theta^{(j)})^Tx^{(i)}-y^{(i,j)})^2 + L_2 $$<br/>\n",
    "其中 $ r(i,j)=1 表示第j个用户对第i个电影有评分， y^{(i,j)}=第j个用户对第i个电影的具体评分 $\n",
    "### 6.gradient(params)\n",
    "计算梯度的方法，该函数将在后面作为参数传递给minimize函数，具体参数的求解将由minimize方法实现，该方法只是告知minimize函数梯度的计算方式。\n",
    "$$ \\frac{\\Delta J}{\\Delta x_k^{(i)}}= \\sum_{j:r(i,j)=1}((\\theta^{(j)})^Tx^{(i)}-y^{(i,j)})\\theta_k^{(j)}+\\lambda x_k^{(i)}$$\n",
    "$$ \\frac{\\Delta J}{\\Delta \\theta_k^{(j)}}= \\sum_{i:r(i,j)=1}((\\theta^{(j)})^Tx^{(i)}-y^{(i,j)})x_k^{(i)}+\\lambda \\theta_k^{(j)}$$\n",
    "### 7.train()\n",
    "训练函数，在该函数中进行目标函数的最优化，求解最终的Theta与X的值\n",
    "### 8.predict()\n",
    "预测函数，对新用户进行电影评分的预测\n",
    "$$ pre = (\\theta^{(j)})x^{(i)} + \\mu_i $$\n",
    "其中 $ \\mu_i 为第i部电影的均值标准化，为避免最终的答案中存在赋值，或无意义的答案 $\n",
    "### 9.getTopRecommends(Theta, X, i, count, rated, items)\n",
    "获得推荐，对预测评分为前count的电影进行推荐"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        Y - 用户对影片的评分矩阵\n",
    "        R - 用户j是否对影片i评分的矩阵 (0/1)\n",
    "        params - 若有初始化参数，可在此传入(Theta, X)\n",
    "        n - 电影的特征数\n",
    "        theLambda - 正则化参数\n",
    "        maxIter - 最大迭代次数\n",
    "    Returns:\n",
    "        train - 训练函数\n",
    "        predict - 预测函数\n",
    "        getTopRecommends - 获取特定影片的最相似推荐\n",
    "    \"\"\"\n",
    "\n",
    "    # 影片数，用户数\n",
    "    nm, nu = Y.shape\n",
    "\n",
    "    # 标准化影片的评分\n",
    "    mu = np.zeros((Y.shape[0], 1), dtype=np.float)\n",
    "    for i in range(nm):\n",
    "        totalRates = np.sum(Y[i])\n",
    "        validCount = len(np.nonzero(R[i])[0])\n",
    "        mu[i] = totalRates / validCount\n",
    "    Y = Y - mu\n",
    "\n",
    "    def roll(Theta, X):\n",
    "        \"\"\"\n",
    "        对于模型而言，Theta和X都是待学习的参数，需要放在一起直接优化\n",
    "        Args:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 电影特征矩阵\n",
    "        Returns:\n",
    "            vector - 折叠后的参数\n",
    "        \"\"\"\n",
    "\n",
    "        #return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))\n",
    "        return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))\n",
    "\n",
    "    def unroll(vector):\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            vector 参数向量\n",
    "        Returns:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 电影特征矩阵\n",
    "        \"\"\"\n",
    "        X = np.mat(vector[:nm * n].reshape(n, nm).T)\n",
    "        Theta = np.mat(vector[nm * n:].reshape(n, nu).T)\n",
    "        return Theta, X\n",
    "\n",
    "    def initParams():\n",
    "        \"\"\"初始化参数\n",
    "\n",
    "        Returns:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 电影特征矩阵\n",
    "        \"\"\"\n",
    "        Theta = np.mat(np.random.rand(nu, n))\n",
    "        X = np.mat(np.random.rand(nm, n))\n",
    "        return Theta, X\n",
    "\n",
    "    def regularize(param):\n",
    "        \"\"\"对参数进行正则化\n",
    "        Args:\n",
    "            param - 参数\n",
    "        Return:\n",
    "            regParam - 正规化后的参数\n",
    "        \"\"\"\n",
    "        return theLambda * 0.5 * np.sum(np.power(param, 2))\n",
    "\n",
    "    def J(params):\n",
    "        \"\"\"代价函数\n",
    "\n",
    "        Args:\n",
    "            params - 参数向量\n",
    "            nu - 用户数\n",
    "            nm - 电影数\n",
    "            n - 特征数\n",
    "        Return:\n",
    "            J - 预测代价\n",
    "        \"\"\"\n",
    "        # 参数展开\n",
    "        Theta, X = unroll(params)\n",
    "        # 计算误差\n",
    "        rows, cols = np.nonzero(R)\n",
    "        # 预测\n",
    "        h = predict(Theta, X)\n",
    "        diff = h - Y\n",
    "        diff[R != 1] = 0\n",
    "        error = 0.5 * np.sum(np.power(diff, 2))\n",
    "        \n",
    "        #  正则化 Theta\n",
    "        regTheta = regularize(Theta)\n",
    "        #  正规化 x\n",
    "        regX = regularize(X)\n",
    "        \n",
    "        return error + regTheta + regX\n",
    "\n",
    "    def gradient(params):\n",
    "        \"\"\"计算梯度\n",
    "\n",
    "        Args:\n",
    "            params - 参数向量\n",
    "        Returns:\n",
    "            grad - 梯度向量\n",
    "        \"\"\"\n",
    "        Theta, X = unroll(params)\n",
    "        \n",
    "        # 当前梯度初始化成0\n",
    "        ThetaGrad = np.mat(np.zeros(Theta.shape))\n",
    "        XGrad = np.mat(np.zeros(X.shape))\n",
    "        \n",
    "        error = predict(Theta, X) - Y\n",
    "        error[R != 1] = 0\n",
    "        \n",
    "        # 这里只需要计算梯度\n",
    "        ThetaGrad = error.T * X + theLambda * Theta\n",
    "        XGrad =  error * Theta + theLambda * X\n",
    "        \n",
    "        return roll(ThetaGrad, XGrad)\n",
    "\n",
    "    def train():\n",
    "        \"\"\"训练\n",
    "\n",
    "        Returns:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 电影特征矩阵\n",
    "        \"\"\"\n",
    "        # 初始化参数\n",
    "        if not params:\n",
    "            Theta, X = initParams()\n",
    "        else:\n",
    "            Theta = params['Theta']\n",
    "            X = params['X']\n",
    "            \n",
    "        # 最小化目标函数\n",
    "        res = minimize(J, x0=roll(Theta, X), jac=gradient,\n",
    "                       method='CG', options={'disp': True, 'maxiter': maxIter})\n",
    "        Theta, X = unroll(res.x)\n",
    "        return Theta, X\n",
    "\n",
    "    def predict(Theta, X):\n",
    "        \"\"\"预测\n",
    "        Args:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 电影特征矩阵\n",
    "        Return:\n",
    "            h 预测\n",
    "        \"\"\"\n",
    "        return X * Theta.T + mu\n",
    "\n",
    "    def getTopRecommends(Theta, X, i, count, rated, items):\n",
    "        \"\"\"获得推荐\n",
    "\n",
    "        Args:\n",
    "            Theta - 用户偏好矩阵\n",
    "            X - 影片特征矩阵\n",
    "            i - 用户索引\n",
    "            count - 目标推荐数量\n",
    "            rated - 已经评价的影片id\n",
    "            items - 影片库\n",
    "        Returns:\n",
    "            topRecommends - 推荐项目\n",
    "        \"\"\"\n",
    "        predictions = predict(Theta, X)[:, i]\n",
    "        \n",
    "        # 实用pandas的DataFrame可以将不同类型数据放在一个Frame中，方便排序等操作\n",
    "        # 相较而言，numpy的多维数组要求内部类型完全一致\n",
    "        df = pd.DataFrame(data=predictions, columns=['prediction',])\n",
    "        df['movie'] = items\n",
    "        df.sort_values(by='prediction', ascending=False,inplace=True)\n",
    "        # 不推荐已经评过分的影片\n",
    "        df.drop(rated, inplace=True)\n",
    "        \n",
    "        return df[0:count]\n",
    "\n",
    "    return train, predict, getTopRecommends"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.io import loadmat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loadmat('data/ex8_movies.mat')\n",
    "Y = data['Y']\n",
    "R = data['R']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "movieParams = loadmat('data/ex8_movieParams.mat')\n",
    "nm = movieParams['num_movies'][0,0]\n",
    "n = movieParams['num_features'][0,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getMovie(line):\n",
    "    return ' '.join(line.split()[1:])\n",
    "movieList = []\n",
    "with open('data/movie_ids.txt') as f:\n",
    "    for line in f:\n",
    "        movieList.append(getMovie(line.strip()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "我的评分:\n",
      "Toy Story (1995)                                   4.0\n",
      "Twelve Monkeys (1995)                              3.0\n",
      "Usual Suspects, The (1995)                         5.0\n",
      "Outbreak (1995)                                    4.0\n",
      "Shawshank Redemption, The (1994)                   5.0\n",
      "While You Were Sleeping (1995)                     3.0\n",
      "Forrest Gump (1994)                                5.0\n",
      "Silence of the Lambs, The (1991)                   2.0\n",
      "Alien (1979)                                       4.0\n",
      "Die Hard 2 (1990)                                  5.0\n",
      "Sphere (1998)                                      5.0\n"
     ]
    }
   ],
   "source": [
    "myRatings = np.mat(np.zeros((nm,1)))\n",
    "\n",
    "myRatings[0] = 4\n",
    "myRatings[97] = 2\n",
    "myRatings[6] = 3\n",
    "myRatings[11] = 5\n",
    "myRatings[53] = 4\n",
    "myRatings[63] = 5\n",
    "myRatings[65] = 3\n",
    "myRatings[68] = 5\n",
    "myRatings[182] = 4\n",
    "myRatings[225] = 5\n",
    "myRatings[354] = 5\n",
    "\n",
    "print('我的评分:')\n",
    "for i in range(nm):\n",
    "    if myRatings[i] > 0:\n",
    "        print('{:<50} {}'.format( movieList[i], myRatings[i].A[0,0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将我们的新用户数据加入\n",
    "Y = np.column_stack((myRatings, Y))\n",
    "R = np.column_stack((myRatings, R)).astype(bool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, predict, getTopRecommends = getRecommender(\n",
    "    Y, R, n=n, theLambda=10.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Maximum number of iterations has been exceeded.\n",
      "         Current function value: 71167.279528\n",
      "         Iterations: 100\n",
      "         Function evaluations: 155\n",
      "         Gradient evaluations: 155\n"
     ]
    }
   ],
   "source": [
    "Theta, X = train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rated = np.nonzero(myRatings)[0].tolist()\n",
    "# -1 就是我们刚才加入的最新用户\n",
    "topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList)\n",
    "topRecommends"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
